{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a5a3a1a7-97ab-4b7e-970c-a62b8fe1b45e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import wget\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "ebb01d66-a750-4f1b-aba8-65f5ea3e6461",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: SO4\n",
      "Downloading: PO4\n",
      "Downloading: NAG\n",
      "Downloading: HEM\n",
      "Downloading: BME\n",
      "Downloading: EDO\n",
      "Downloading: PLP\n"
     ]
    }
   ],
   "source": [
    "## Download the data\n",
    "DSET_URL = {\n",
    "    \"SO4\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/SO4.txt\",\n",
    "    \"PO4\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/PO4.txt\",\n",
    "    \"NAG\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/NAG.txt\",\n",
    "    \"HEM\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/HEM.txt\",\n",
    "    \"BME\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/BME.txt\",\n",
    "    \"EDO\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/EDO.txt\",\n",
    "    \"PLP\": \"https://webs.iiitd.edu.in/raghava/ccpdb/datasets/PLP.txt\",\n",
    "}\n",
    "\n",
    "for k, v in DSET_URL.items():\n",
    "    print(f\"Downloading: {k}\") \n",
    "    wget.download(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "b408f983-e8ca-44a8-b90d-3877696700ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PDB</th>\n",
       "      <th>chain</th>\n",
       "      <th>sequence</th>\n",
       "      <th>interacting_residues</th>\n",
       "      <th>length</th>\n",
       "      <th>interactor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1914</td>\n",
       "      <td>A</td>\n",
       "      <td>MVLLESEQFLTELTRLFQKCRSSGSVFITLKKYDEGLEPAENKCLL...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>342</td>\n",
       "      <td>BME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1a6j</td>\n",
       "      <td>B</td>\n",
       "      <td>MTNNDTTLQLSSVLNRECTRSRVHCQSKKRALEIISELAAKQLSLP...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>314</td>\n",
       "      <td>BME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1bff</td>\n",
       "      <td>A</td>\n",
       "      <td>KDPKRLYCKNGGFFLRIHPDGRVDGVREKSDPHIKLQLQAEERGVV...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>258</td>\n",
       "      <td>BME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1btc</td>\n",
       "      <td>A</td>\n",
       "      <td>SNMLLNYVPVYVMLPLGVVNVDNVFEDPDGLKEQLLQLRAAGVDGV...</td>\n",
       "      <td>--+-------------------------------------------...</td>\n",
       "      <td>982</td>\n",
       "      <td>BME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1cws</td>\n",
       "      <td>A</td>\n",
       "      <td>SDHRELIGDYSKAFLLQTVDGKHQDLKYISPETMVALLTGKFSNIV...</td>\n",
       "      <td>------------------------+---------------------...</td>\n",
       "      <td>356</td>\n",
       "      <td>BME</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    PDB chain                                           sequence  \\\n",
       "0  1914     A  MVLLESEQFLTELTRLFQKCRSSGSVFITLKKYDEGLEPAENKCLL...   \n",
       "1  1a6j     B  MTNNDTTLQLSSVLNRECTRSRVHCQSKKRALEIISELAAKQLSLP...   \n",
       "2  1bff     A  KDPKRLYCKNGGFFLRIHPDGRVDGVREKSDPHIKLQLQAEERGVV...   \n",
       "3  1btc     A  SNMLLNYVPVYVMLPLGVVNVDNVFEDPDGLKEQLLQLRAAGVDGV...   \n",
       "4  1cws     A  SDHRELIGDYSKAFLLQTVDGKHQDLKYISPETMVALLTGKFSNIV...   \n",
       "\n",
       "                                interacting_residues  length interactor  \n",
       "0  ----------------------------------------------...     342        BME  \n",
       "1  ----------------------------------------------...     314        BME  \n",
       "2  ----------------------------------------------...     258        BME  \n",
       "3  --+-------------------------------------------...     982        BME  \n",
       "4  ------------------------+---------------------...     356        BME  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from Bio import SeqIO\n",
    "\n",
    "def parse_dset(filename: str, write: bool = False) -> pd.DataFrame:\n",
    "    with open(filename) as fasta_file:  # Will close handle cleanly\n",
    "        identifiers = []\n",
    "        chains = []\n",
    "        sequences = []\n",
    "        interactions = []\n",
    "        lengths = []\n",
    "        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)\n",
    "            identifiers.append(seq_record.id[:-1])\n",
    "            chains.append(seq_record.id[-1])\n",
    "            lengths.append(len(seq_record.seq))\n",
    "\n",
    "            parsed_sequence_and_interactions = re.split(';', re.sub(\"\\+|-\", lambda match: ';' + match.group(), str(seq_record.seq), count=1), maxsplit=1)#str(seq_record.seq).split(\"-\", maxsplit=1)\n",
    "            sequences.append(parsed_sequence_and_interactions[0])\n",
    "            interactions.append(parsed_sequence_and_interactions[1])\n",
    "\n",
    "    data = zip(identifiers, chains, sequences, interactions,lengths)\n",
    "    df = pd.DataFrame.from_dict(data)\n",
    "    df.columns = [\"PDB\", \"chain\", \"sequence\", \"interacting_residues\", \"length\"]\n",
    "    df[\"interactor\"] = filename.split(\".\")[0]\n",
    "    \n",
    "    if write:\n",
    "        df.to_csv(filename.split(\".\")[0] + \".csv\")\n",
    "    return df\n",
    "    \n",
    "df = parse_dset(\"BME.txt\", write=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "01c9b42f-70e3-496c-a7f4-5cefbb86376e",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Iterate over datasets\n",
    "dataset_list = [parse_dset(k + \".txt\", write=True) for k in DSET_URL.keys()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "23f8b6c4-fb8c-4090-94a1-f4880b1db6cf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PDB</th>\n",
       "      <th>chain</th>\n",
       "      <th>sequence</th>\n",
       "      <th>interacting_residues</th>\n",
       "      <th>length</th>\n",
       "      <th>interactor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2olg</td>\n",
       "      <td>A</td>\n",
       "      <td>RNRRPELLPNDCGYQVEADKILNGDDTVPEEFPWTAMIGYKNSSNF...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>552</td>\n",
       "      <td>SO4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4fi1</td>\n",
       "      <td>A</td>\n",
       "      <td>MKCRVWSEARVYTNINKQRTEEYWDYENTVIDWSTNTKDYEIENKV...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>742</td>\n",
       "      <td>SO4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4g9f</td>\n",
       "      <td>E</td>\n",
       "      <td>NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>500</td>\n",
       "      <td>SO4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>101m</td>\n",
       "      <td>A</td>\n",
       "      <td>MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>308</td>\n",
       "      <td>SO4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4ah9</td>\n",
       "      <td>A</td>\n",
       "      <td>SSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFL...</td>\n",
       "      <td>---------++---++------------+-+---++-------+--...</td>\n",
       "      <td>308</td>\n",
       "      <td>SO4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7272</th>\n",
       "      <td>5hsj</td>\n",
       "      <td>A</td>\n",
       "      <td>SLKDLDLNALFIGDKAENGQLYKDLLNKLVDEHLGWRKNYIPSDPN...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>1212</td>\n",
       "      <td>PLP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7273</th>\n",
       "      <td>5hxx</td>\n",
       "      <td>A</td>\n",
       "      <td>VSLQDFDAERIGLFHEDIKRKFDELKSKNLKLDLTRGKPSSEQLDF...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>848</td>\n",
       "      <td>PLP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7274</th>\n",
       "      <td>5ijg</td>\n",
       "      <td>A</td>\n",
       "      <td>LLHPETQMLNSEIVEDRLAVYEGAESAALFSSGMSAIATTLFAFVR...</td>\n",
       "      <td>------------------------------+++--+----------...</td>\n",
       "      <td>706</td>\n",
       "      <td>PLP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>5w71</td>\n",
       "      <td>A</td>\n",
       "      <td>IPFDHWPEWPQHSDRTRRKIEEVFQSNRWAISGYWTGEESMERKFA...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>826</td>\n",
       "      <td>PLP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7276</th>\n",
       "      <td>5x03</td>\n",
       "      <td>B</td>\n",
       "      <td>DWISFSHMSSDTDHFPIKSWFRCEQKAASRSYRTLGDMSHPQGIYE...</td>\n",
       "      <td>----------------------------------------------...</td>\n",
       "      <td>730</td>\n",
       "      <td>PLP</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7277 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       PDB chain                                           sequence  \\\n",
       "0     2olg     A  RNRRPELLPNDCGYQVEADKILNGDDTVPEEFPWTAMIGYKNSSNF...   \n",
       "1     4fi1     A  MKCRVWSEARVYTNINKQRTEEYWDYENTVIDWSTNTKDYEIENKV...   \n",
       "2     4g9f     E  NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...   \n",
       "3     101m     A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   \n",
       "4     4ah9     A  SSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFL...   \n",
       "...    ...   ...                                                ...   \n",
       "7272  5hsj     A  SLKDLDLNALFIGDKAENGQLYKDLLNKLVDEHLGWRKNYIPSDPN...   \n",
       "7273  5hxx     A  VSLQDFDAERIGLFHEDIKRKFDELKSKNLKLDLTRGKPSSEQLDF...   \n",
       "7274  5ijg     A  LLHPETQMLNSEIVEDRLAVYEGAESAALFSSGMSAIATTLFAFVR...   \n",
       "7275  5w71     A  IPFDHWPEWPQHSDRTRRKIEEVFQSNRWAISGYWTGEESMERKFA...   \n",
       "7276  5x03     B  DWISFSHMSSDTDHFPIKSWFRCEQKAASRSYRTLGDMSHPQGIYE...   \n",
       "\n",
       "                                   interacting_residues  length interactor  \n",
       "0     ----------------------------------------------...     552        SO4  \n",
       "1     ----------------------------------------------...     742        SO4  \n",
       "2     ----------------------------------------------...     500        SO4  \n",
       "3     ----------------------------------------------...     308        SO4  \n",
       "4     ---------++---++------------+-+---++-------+--...     308        SO4  \n",
       "...                                                 ...     ...        ...  \n",
       "7272  ----------------------------------------------...    1212        PLP  \n",
       "7273  ----------------------------------------------...     848        PLP  \n",
       "7274  ------------------------------+++--+----------...     706        PLP  \n",
       "7275  ----------------------------------------------...     826        PLP  \n",
       "7276  ----------------------------------------------...     730        PLP  \n",
       "\n",
       "[7277 rows x 6 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.concat(dataset_list)\n",
    "df.reset_index(inplace=True, drop=True)\n",
    "df.to_csv(\"PROTEINS_LIGANDS.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23e06c17-7bdd-4d32-bf46-f0764a2d5cb2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
